srun -A bch -p bch-interactive --mem=16GB --qos=interactive --pty /bin/bash
source /programs/biogrids.shrc
export R_X=4.2.1

library(Seurat)
library(tidyverse)
library(WGCNA)
library(umap)

ms_seurat <- CreateSeuratObject(counts=ms.dat,min.cells=3,min.features=200)
ms_seurat <- NormalizeData(ms_seurat,normalization.method="LogNormalize",scale.factor=1000,verbose=FALSE)

#find percentage of mitochondrial genes in each cell
ens <- read.delim("mart_export.txt")
g <- duplicated(ens[,1])
ens <- ens[!g,]
dimnames(ens)[[1]] <- ens[,1]
mito_genes <- tolower(substr(ens[,6],1,3)) == "mt-"
mito_genes <- ens[mito_genes,1]
x <- mito_genes %in% dimnames(ms_seurat$RNA)[[1]]
mito_genes <- mito_genes[x]
ms_seurat[["percent_mt"]] <- PercentageFeatureSet(ms_seurat,features=mito_genes)

#filter out low quality cells, find variable genes
ms_filtered <- subset(ms_seurat, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent_mt < 5)
ms_filtered <- FindVariableFeatures(ms_filtered, selection.method = "vst", nfeatures = 5000)

#clustering using WGCNA
n <- t(as.matrix(ms_filtered$RNA[VariableFeatures(ms_filtered),]))
a <- cor(n)
b <- a < 0
a[b] <- 0
tom <- TOMdist(a)
clust <- hclust(as.dist(tom),method="average")
m <- cutreeDynamic(clust,distM=tom,deepSplit=4)
me <- moduleEigengenes(n,m)

clust_umap <- umap(me[[1]][,2:19])

#clusters
x <- hclust(dist(clust_umap$layout),method="average")
k <- rect.hclust(x,k=10)
clust_colors <- rainbow(length(k))
clust_cols <- rep("",dim(clust_umap$layout)[[1]])
clust_num <- rep(0,dim(clust_umap$layout)[[1]])
for(i in c(1:length(k))){
	clust_cols[k[[i]]] <- clust_colors[i]
	clust_num[k[[i]]] <- i
}
plot(clust_umap$layout,col=clust_cols,pch=20,cex=.25)

save(clust_colors,clust_cols,clust_num,file="clusters.data")

#plot by library
cell_bc <- dimnames(clust_umap$layout)[[1]]
x <- unlist(strsplit(cell_bc,"-",fixed=TRUE))
lib_index <- x[seq(2,length(x),by=2)]
lib_colors <- rainbow(length(unique(lib_index)))
lib_cols <- rep("",dim(clust_umap$layout)[[1]])
for(i in c(1:length(lib_colors))){
	x <- lib_index == i
	lib_cols[x] <- lib_colors[i]
}
plot(clust_umap$layout,col=lib_cols,pch=20,cex=.25)

#Cluster markers
Idents(ms_filtered) <- clust_num
for(i in c(1:10)){
	d <- FindMarkers(ms_filtered,ident.1=i,ident.2=NULL,only.pos=TRUE,logfc.threshold=0,min.pct=0,min.diff.pct=0,thresh.test=0)
	d <- cbind(dimnames(d)[[1]],d)
	cluster_markers <- merge(d,ens[,c(1,6)],all.x=TRUE,by=1)
	filename <- paste("./de_clust/cluster",i,"_markers.csv",sep="")
	write.table(cluster_de,filename,sep=",")
	print(i)
}

#plot by tissue origin
geno_type <- substr(dimnames(ms_exp)[[2]],1,1)
geno_cols <- rep("",length(geno_type))
geno_cols[geno_type=="C"] <- "blue"
geno_cols[geno_type=="K"] <- "red"
plot(clust_umap$layout,col=geno_cols,pch=20,cex=.25)

x <- table(paste(clust_num,geno_type,sep="_"))
cl_geno <- cbind(x[seq(1,length(x),by=2)],x[seq(2,length(x),by=2)])

#differential expression by tissue
x <- paste(clust_num,geno_type,sep="_")
Idents(organoid_filtered) <- x
for(i in c(1:10)){
	d <- FindMarkers(ms_filtered,ident.1=paste(i,"_C",sep=""),ident.2=paste(i,"_K",sep=""),logfc.threshold=0,min.pct=0,min.diff.pct=0,thresh.test=0)
	d <- cbind(dimnames(d)[[1]],d)
	cluster_de <- merge(d,ens[,c(1,6)],all.x=TRUE,by=1)
	filename <- paste("./de_geno/cluster",i,"_de.csv",sep="")
	write.table(cluster_de,filename,sep=",")
	print(i)
}

##
g <- rep(0,10)
for(i in c(1:10)){
	f <- read.csv(paste("./de_geno/cluster",i,"_de.csv",sep=""))
	d <- f[,6] < .1 & abs(f[,3]) > .05
	g[i] <- countTrue(d)
}

##TF
geno_type <- substr(dimnames(ms_exp)[[2]],1,1)
c9 <- clust_num == 9
egr1_c <- ms_exp["ENSMUSG00000038418",c9 & geno_type == "C"]
egr1_k <- ms_exp["ENSMUSG00000038418",c9 & geno_type == "K"]

x <- c(egr1_c,egr1_k)
y <- factor(c(rep("egr1_c",length(egr1_c)),rep("egr1_k",length(egr1_k))))
z <- data.frame(exp=x,name=y)
p <- ggplot(z,aes(x=name,y=exp)) + geom_violin(scale="width")

sp1_c <- ms_exp["ENSMUSG00000001280",c9 & geno_type == "C"]
sp1_k <- ms_exp["ENSMUSG00000001280",c9 & geno_type == "K"]
egr1_c <- ms_exp["ENSMUSG00000038418",c9 & geno_type == "C"]
egr1_k <- ms_exp["ENSMUSG00000038418",c9 & geno_type == "K"]
sp3_c <- ms_exp["ENSMUSG00000027109",c9 & geno_type == "C"]
sp3_k <- ms_exp["ENSMUSG00000027109",c9 & geno_type == "K"]
klf12_c <- ms_exp["ENSMUSG00000072294",c9 & geno_type == "C"]
klf12_k <- ms_exp["ENSMUSG00000072294",c9 & geno_type == "K"]
thap1_c <- ms_exp["ENSMUSG00000037214",c9 & geno_type == "C"]
thap1_k <- ms_exp["ENSMUSG00000037214",c9 & geno_type == "K"]

level_order <- c("sp1_c","sp1_k","egr1_c","egr1_k","sp3_c","sp3_k","klf12_c","klf12_k","thap1_c","thap1_k")

x <- c(sp1_c,sp1_k,egr1_c,egr1_k,sp3_c,sp3_k,klf12_c,klf12_k,thap1_c,thap1_k)
y <- factor(c(rep("sp1_c",length(sp1_c)),rep("sp1_k",length(sp1_k)),rep("egr1_c",length(egr1_c)),rep("egr1_k",length(egr1_k)),rep("sp3_c",length(sp3_c)),rep("sp3_k",length(sp3_k)),rep("klf12_c",length(klf12_c)),rep("klf12_k",length(klf12_k)),rep("thap1_c",length(thap1_c)),rep("thap1_k",length(thap1_k))),levels=level_order)
y_col <-c(rep("blue",length(sp1_c)),rep("red",length(sp1_k)),rep("blue",length(egr1_c)),rep("red",length(egr1_k)),rep("blue",length(sp3_c)),rep("red",length(sp3_k)),rep("blue",length(klf12_c)),rep("red",length(klf12_k)),rep("blue",length(thap1_c)),rep("red",length(thap1_k)))
z <- data.frame(exp=x,name=y)
p <- ggplot(z,aes(x=name,y=exp)) + geom_violin(scale="width")

y_col <-c(rep(8,length(sp1_c)),rep(4,length(sp1_k)),rep(8,length(egr1_c)),rep(4,length(egr1_k)),rep(8,length(sp3_c)),rep(4,length(sp3_k)),rep(8,length(klf12_c)),rep(4,length(klf12_k)),rep(8,length(thap1_c)),rep(4,length(thap1_k)))
